## Loading required modules
# NOTE(review): warn = -1 makes R ignore every warning for the rest of the
# session, so coercion/parsing problems in the steps below pass unnoticed.
# Consider removing it or suppressing only the specific noisy calls.
options(warn=-1)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)
library(ggplot2)
## Loading the survey data

# Raw survey export, read with read.csv() defaults. The str() output below
# shows the first data row carrying the question text (e.g.
# "Duration (in seconds)") rather than a response.
df <- read.csv(file = 'kaggle_survey_2020_responses.csv')
dim(x = df)
## [1] 20037   355
str(object = df)
## 'data.frame':    20037 obs. of  355 variables:
##  $ Time.from.Start.to.Finish..seconds.: chr  "Duration (in seconds)" "1838" "289287" "860" ...
##  $ Q1                                 : chr  "What is your age (# years)?" "35-39" "30-34" "35-39" ...
##  $ Q2                                 : chr  "What is your gender? - Selected Choice" "Man" "Man" "Man" ...
##  $ Q3                                 : chr  "In which country do you currently reside?" "Colombia" "United States of America" "Argentina" ...
##  $ Q4                                 : chr  "What is the highest level of formal education that you have attained or plan to attain within the next 2 years?" "Doctoral degree" "Master’s degree" "Bachelor’s degree" ...
##  $ Q5                                 : chr  "Select the title most similar to your current role (or most recent title if retired): - Selected Choice" "Student" "Data Engineer" "Software Engineer" ...
##  $ Q6                                 : chr  "For how many years have you been writing code and/or programming?" "5-10 years" "5-10 years" "10-20 years" ...
##  $ Q7_Part_1                          : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python" "Python" "Python" "" ...
##  $ Q7_Part_2                          : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - R" "R" "R" "" ...
##  $ Q7_Part_3                          : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - SQL" "SQL" "SQL" "" ...
##  $ Q7_Part_4                          : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C" "C" "" "" ...
##  $ Q7_Part_5                          : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C++" "" "" "" ...
##  $ Q7_Part_6                          : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Java" "" "" "Java" ...
##  $ Q7_Part_7                          : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Javascript" "Javascript" "" "Javascript" ...
##  $ Q7_Part_8                          : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Julia" "" "" "" ...
##  $ Q7_Part_9                          : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Swift" "" "" "" ...
##  $ Q7_Part_10                         : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Bash" "" "" "Bash" ...
##  $ Q7_Part_11                         : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - MATLAB" "MATLAB" "" "" ...
##  $ Q7_Part_12                         : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - None" "" "" "" ...
##  $ Q7_OTHER                           : chr  "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Other" "Other" "" "" ...
##  $ Q8                                 : chr  "What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice" "Python" "Python" "R" ...
##  $ Q9_Part_1                          : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "Jupyter (JupyterLab, Jupyter Notebooks, etc) " "" "" ...
##  $ Q9_Part_2                          : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "" "" "" ...
##  $ Q9_Part_3                          : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "" "Visual Studio" "" ...
##  $ Q9_Part_4                          : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "Visual Studio Code (VSCode)" "" "Visual Studio Code (VSCode)" ...
##  $ Q9_Part_5                          : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "" " PyCharm " "" ...
##  $ Q9_Part_6                          : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "  Spyder  " "" "" ...
##  $ Q9_Part_7                          : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "" "" "  Notepad++  " ...
##  $ Q9_Part_8                          : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "" "  Sublime Text  " "  Sublime Text  " ...
##  $ Q9_Part_9                          : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "" "" "  Vim / Emacs  " ...
##  $ Q9_Part_10                         : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "" "" "" ...
##  $ Q9_Part_11                         : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "" "" "" ...
##  $ Q9_OTHER                           : chr  "Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all "| __truncated__ "" "" "" ...
##  $ Q10_Part_1                         : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ " Kaggle Notebooks" "" "" ...
##  $ Q10_Part_2                         : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "Colab Notebooks" "Colab Notebooks" "" ...
##  $ Q10_Part_3                         : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "" ...
##  $ Q10_Part_4                         : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "" ...
##  $ Q10_Part_5                         : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "" ...
##  $ Q10_Part_6                         : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "" ...
##  $ Q10_Part_7                         : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "" ...
##  $ Q10_Part_8                         : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "" ...
##  $ Q10_Part_9                         : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "" ...
##  $ Q10_Part_10                        : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "" ...
##  $ Q10_Part_11                        : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "" ...
##  $ Q10_Part_12                        : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "" ...
##  $ Q10_Part_13                        : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "None" ...
##  $ Q10_OTHER                          : chr  "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "" "" "" ...
##  $ Q11                                : chr  "What type of computing platform do you use most often for your data science projects? - Selected Choice" "A cloud computing platform (AWS, Azure, GCP, hosted notebooks, etc)" "A personal computer or laptop" "A personal computer or laptop" ...
##  $ Q12_Part_1                         : chr  "Which types of specialized hardware do you use on a regular basis?  (Select all that apply) - Selected Choice - GPUs" "GPUs" "GPUs" "" ...
##  $ Q12_Part_2                         : chr  "Which types of specialized hardware do you use on a regular basis?  (Select all that apply) - Selected Choice - TPUs" "" "" "" ...
##  $ Q12_Part_3                         : chr  "Which types of specialized hardware do you use on a regular basis?  (Select all that apply) - Selected Choice - None" "" "" "None" ...
##  $ Q12_OTHER                          : chr  "Which types of specialized hardware do you use on a regular basis?  (Select all that apply) - Selected Choice - Other" "" "" "" ...
##  $ Q13                                : chr  "Approximately how many times have you used a TPU (tensor processing unit)?" "2-5 times" "2-5 times" "Never" ...
##  $ Q14_Part_1                         : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected C"| __truncated__ " Matplotlib " " Matplotlib " "" ...
##  $ Q14_Part_2                         : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected C"| __truncated__ "" " Seaborn " "" ...
##  $ Q14_Part_3                         : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected C"| __truncated__ "" "" "" ...
##  $ Q14_Part_4                         : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected C"| __truncated__ "" " Ggplot / ggplot2 " "" ...
##  $ Q14_Part_5                         : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected Choice -  Shiny " "" " Shiny " "" ...
##  $ Q14_Part_6                         : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected Choice -  D3 js " "" "" " D3 js " ...
##  $ Q14_Part_7                         : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected C"| __truncated__ "" "" "" ...
##  $ Q14_Part_8                         : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected Choice -  Bokeh " "" "" "" ...
##  $ Q14_Part_9                         : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected C"| __truncated__ " Geoplotlib " "" "" ...
##  $ Q14_Part_10                        : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected C"| __truncated__ "" "" "" ...
##  $ Q14_Part_11                        : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected Choice - None" "" "" "" ...
##  $ Q14_OTHER                          : chr  "What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected Choice - Other" "" "" "" ...
##  $ Q15                                : chr  "For how many years have you used machine learning methods?" "1-2 years" "1-2 years" "I do not use machine learning methods" ...
##  $ Q16_Part_1                         : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "  Scikit-learn " "" ...
##  $ Q16_Part_2                         : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "  TensorFlow " "  TensorFlow " "" ...
##  $ Q16_Part_3                         : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ " Keras " " Keras " "" ...
##  $ Q16_Part_4                         : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" " PyTorch " "" ...
##  $ Q16_Part_5                         : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
##  $ Q16_Part_6                         : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
##  $ Q16_Part_7                         : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ " Xgboost " "" "" ...
##  $ Q16_Part_8                         : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
##  $ Q16_Part_9                         : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
##  $ Q16_Part_10                        : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
##  $ Q16_Part_11                        : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
##  $ Q16_Part_12                        : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
##  $ Q16_Part_13                        : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
##  $ Q16_Part_14                        : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
##  $ Q16_Part_15                        : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
##  $ Q16_OTHER                          : chr  "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
##  $ Q17_Part_1                         : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "" "Linear or Logistic Regression" "" ...
##  $ Q17_Part_2                         : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Decision Trees or Random Forests" "" "" ...
##  $ Q17_Part_3                         : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Gradient Boosting Machines (xgboost, lightgbm, etc)" "" "" ...
##  $ Q17_Part_4                         : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Bayesian Approaches" "" "" ...
##  $ Q17_Part_5                         : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "" "" "" ...
##  $ Q17_Part_6                         : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Dense Neural Networks (MLPs, etc)" "" "" ...
##  $ Q17_Part_7                         : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Convolutional Neural Networks" "Convolutional Neural Networks" "" ...
##  $ Q17_Part_8                         : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "" "" "" ...
##  $ Q17_Part_9                         : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Recurrent Neural Networks" "" "" ...
##  $ Q17_Part_10                        : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "" "Transformer Networks (BERT, gpt-3, etc)" "" ...
##  $ Q17_Part_11                        : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice - None" "" "" "" ...
##  $ Q17_OTHER                          : chr  "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice - Other" "" "" "" ...
##  $ Q18_Part_1                         : chr  "Which categories of computer vision methods do you use on a regular basis?  (Select all that apply) - Selected "| __truncated__ "" "" "" ...
##  $ Q18_Part_2                         : chr  "Which categories of computer vision methods do you use on a regular basis?  (Select all that apply) - Selected "| __truncated__ "" "Image segmentation methods (U-Net, Mask R-CNN, etc)" "" ...
##  $ Q18_Part_3                         : chr  "Which categories of computer vision methods do you use on a regular basis?  (Select all that apply) - Selected "| __truncated__ "" "" "" ...
##  $ Q18_Part_4                         : chr  "Which categories of computer vision methods do you use on a regular basis?  (Select all that apply) - Selected "| __truncated__ "Image classification and other general purpose networks (VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc)" "Image classification and other general purpose networks (VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc)" "" ...
##  $ Q18_Part_5                         : chr  "Which categories of computer vision methods do you use on a regular basis?  (Select all that apply) - Selected "| __truncated__ "" "" "" ...
##   [list output truncated]
## Loading questions dataframe

# Question metadata: one row per survey column (q_header), with the question
# key, answer type, question text, missing percentage and a short tag.
q_df <- read.csv(file = 'questions_dataframe.csv')
dim(x = q_df)
## [1] 354   7
str(object = q_df)
## 'data.frame':    354 obs. of  7 variables:
##  $ ques_num    : chr  "Q1" "Q10" "Q10" "Q10" ...
##  $ q_header    : chr  "Q1" "Q10_Part_1" "Q10_Part_2" "Q10_Part_3" ...
##  $ ques_keys   : chr  "Q1" "Q10" "Q10" "Q10" ...
##  $ ques_type   : chr  "single_answer" "multiple_answer" "multiple_answer" "multiple_answer" ...
##  $ question    : chr  "What is your age (# years)?" "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ "Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selec"| __truncated__ ...
##  $ missing_perc: num  0 14.2 14.2 14.2 14.2 ...
##  $ tag         : chr  "age" "Notebook Products" "Notebook Products" "Notebook Products" ...

Data Prep

## Creating Salary Buckets
# Empty strings mark unanswered / unselected items in the export; treat them
# as missing values.
df[df==""] <- NA
# Row 1 of df holds the question text, not a response. Drop it explicitly
# before filtering, instead of relying on it surviving the Q24 filter and
# then slicing with 2:nrow() (which misbehaves when only one row is left).
dat <- df[-1,]
# Keep only respondents who reported a salary.
dat <- dat[!is.na(dat$Q24),]
dat$Time.from.Start.to.Finish..seconds. <- NULL
dim(dat)
## [1] 10729   354
# Raw Q24 salary ranges grouped into five coarse brackets.
v_low <- c('$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999', "4,000-4,999", '5,000-7,499', 
           '7,500-9,999', '10,000-14,999', '15,000-19,999')
low <- c('20,000-24,999','25,000-29,999', '30,000-39,999','40,000-49,999')
medium <- c('50,000-59,999','60,000-69,999', '70,000-79,999','80,000-89,999', '90,000-99,999')
high <- c('100,000-124,999', '125,000-149,999')
v_high <- c('150,000-199,999', '200,000-249,999','250,000-299,999', '300,000-500,000', '> $500,000')

# NOTE(review): as before, any label not listed in v_low/low/medium/high falls
# through to 'very high' (v_high itself is never consulted), so an unexpected
# or misspelled range would be bucketed there silently.
dat$Q24 <- case_when(
  dat$Q24 %in% v_low ~ 'very low',
  dat$Q24 %in% low ~ 'low',
  dat$Q24 %in% medium ~ 'medium',
  dat$Q24 %in% high ~ 'high',
  TRUE ~ 'very high'
)

# Respondents per bracket, largest first.
dat %>% count(dat[,'Q24'],sort = TRUE)
##   dat[, "Q24"]    n
## 1     very low 5555
## 2       medium 1865
## 3          low 1806
## 4         high  888
## 5    very high  615
# Distinct (question key, tag) pairs, keeping the original row numbers.
q_df[!duplicated(q_df[, c('ques_keys', 'tag')]), c('ques_keys', 'tag'), drop = FALSE]
##     ques_keys                                 tag
## 1          Q1                                 age
## 2         Q10                   Notebook Products
## 16        Q11                  Computing Platform
## 17        Q12                Specialized Hardware
## 21        Q13                           TPU Usage
## 22        Q14                 Visualisation Tools
## 34        Q15         Machine Learning Experience
## 35        Q16                        ML Framework
## 51        Q17                       ML Algorithms
## 63        Q18             Computer Vision Methods
## 70        Q19                         NLP Methods
## 76         Q2                              gender
## 77        Q20                        Company Size
## 78        Q21              Data Science Team Size
## 79        Q22                    ML in production
## 80        Q23                Work Responsibilites
## 88        Q24                              Salary
## 89        Q25                    ML & Cloud Spend
## 90      Q26_A           Cloud Computing Platforms
## 102     Q26_B           Cloud Computing Platforms
## 114     Q27_A            Cloud Computing Products
## 126     Q27_B            Cloud Computing Products
## 138     Q28_A           Machine Learning Products
## 149     Q28_B           Machine Learning Products
## 160     Q29_A                   Big Data Products
## 178     Q29_B                   Big Data Products
## 196        Q3                country of residence
## 197       Q30        Big Data Products Most Often
## 198     Q31_A                            BI Tools
## 213     Q31_B                            BI Tools
## 228       Q32                 BI Tools Most Often
## 229     Q33_A                       Auto ML Areas
## 237     Q33_B                       Auto ML Areas
## 245     Q34_A                       Auto ML Tools
## 257     Q34_B                       Auto ML Tools
## 269     Q35_A                Experiments Tracking
## 280     Q35_B                Experiments Tracking
## 291       Q36          Analysis Sharing Platforms
## 301       Q37                 DS Course Platforms
## 313       Q38                 Data Analysis Tools
## 314       Q39          Data Science Media Sources
## 326        Q4                           education
## 327        Q5                        current role
## 328        Q6                   coding experience
## 329        Q7               programming languages
## 342        Q8 programming language recommendation
## 343        Q9                                 IDE

Areas for analysis

  1. Education
    • Formal Education (Q4)
    • DS Course Platforms (Q37)
    • Analysis Sharing Platforms (Q36)
    • DS Media Sources (Q39)
  2. Where do you work?
    • Country of Residence (Q3)
    • Company Size (Q20)
    • DS Team Size (Q21)
    • ML in Production (Q22)
  3. Job Role & Experience
    • current role (Q5)
    • coding experience (Q6)
    • Machine Learning Experience (Q15)
    • Work Responsibilities (Q23)
  4. Tools of the Trade
    • Programming Languages (Q7)
    • Computing Platform (Q11)
    • Specialized Hardware (Q12)
    • Visualisation Tools (Q14)
    • ML Frameworks (Q16)
    • ML Algorithms (Q17)
    • Computer Vision Methods (Q18)
    • NLP Methods (Q19)
    • ML & Cloud Spend (Q25)
    • Cloud Computing Platforms (Q26_A)
    • Cloud Computing Products (Q27_A)
    • Machine Learning Products (Q28_A)
    • Big Data Products (Q29_A)
    • BI Tools (Q31_A)
    • Auto ML Tools (Q34_A)
    • Experiments Tracking (Q35_A)
## Helper Functions

# Percentage share of each element of x in the total of x
#
# x: numeric vector of counts.
# Returns a vector of the same length (names kept) holding each element as a
# percentage of sum(x), rounded to two decimals.
perc_share <- function(x){
  round(x / sum(x) * 100, digits = 2)
}

# Computing contingency table for 2 categorical vars & converting counts to percentage values
#
# Reads the global data frames `dat` (responses) and `q_df` (question metadata).
#
# ques_num_1: question key as it appears in q_df$ques_keys (e.g. 'Q4', 'Q37').
#             For a 'single_answer' question the column of that name in `dat`
#             is tabulated; otherwise every column listed for the key in
#             q_df$q_header is tabulated and the tables are stacked row-wise.
# ques_num_2: column of `dat` to cross against (default: salary bucket 'Q24').
#             It must contain the five bracket labels selected below.
#
# Returns a matrix with one row per answer and the columns 'very low', 'low',
# 'medium', 'high', 'very high'; each column holds perc_share() of its counts.
# Stops with an error when ques_num_1 is not a known question key.
contingency_table_pct <- function(ques_num_1, ques_num_2 = 'Q24'){
  key_rows <- which(q_df$ques_keys == ques_num_1)
  if(length(key_rows) == 0){
    stop("Unknown question key: ", ques_num_1, call. = FALSE)
  }

  # Fix the bracket levels up front so every per-column table has the same
  # columns in the same order: rbind() matches columns by position only, so
  # an answer with no respondents in some bracket would otherwise misalign
  # the stacked counts or make rbind() fail.
  brackets <- factor(dat[,ques_num_2], levels = sort(unique(dat[,ques_num_2])))
  count_by_bracket <- function(column){
    table(dat[,column], brackets, deparse.level = 0)
  }

  if(q_df$ques_type[key_rows[1]] == 'single_answer'){
    cross_tab <- count_by_bracket(ques_num_1)
  }
  else{
    # One table per answer column, stacked in a single rbind(); this also
    # works when the question has only one answer column.
    list_of_columns <- q_df$q_header[key_rows]
    cross_tab <- do.call(rbind, lapply(list_of_columns, count_by_bracket))
  }
  # Ascending salary order; drop = FALSE keeps a one-row table two-dimensional.
  cross_tab <- cross_tab[,c('very low', 'low', 'medium', 'high', 'very high'), drop = FALSE]
  cross_tab_pct <- apply(cross_tab,2,perc_share)
  if(is.null(dim(cross_tab_pct))){
    # apply() hands back a plain vector when there is a single answer row;
    # restore the matrix shape callers expect.
    cross_tab_pct <- matrix(cross_tab_pct, nrow = nrow(cross_tab),
                            dimnames = dimnames(cross_tab))
  }
  return(cross_tab_pct)
}


# Side by side (dodged) bar chart
#
# x:      matrix-like object; its row names become the fill groups and its
#         columns the x-axis categories.
# title:  plot title.
# xlabel: x-axis label.
# ylabel: y-axis label.
# Returns the ggplot object.
ss_barplot <- function(x, title, xlabel, ylabel){
  wide <- data.frame(x)
  wide[['index']] <- rownames(wide)
  # melt() picks the character column 'index' as the id variable on its own
  long <- melt(wide)
  bars <- geom_bar(stat = 'identity', position = 'dodge')
  ggplot(long, aes(x = variable, y = value, fill = index)) +
    bars +
    ggtitle(title) +
    labs(y = ylabel, x = xlabel)
}

# One line per row of x, drawn across the columns of x
#
# x:      matrix-like object; each row becomes one coloured line and each
#         column one point on the x-axis.
# title:  plot title.
# xlabel: x-axis label.
# ylabel: y-axis label.
# Returns the ggplot object.
line_charts <- function(x, title, xlabel, ylabel){
  wide <- data.frame(x)
  wide[['index']] <- rownames(wide)
  # melt() picks the character column 'index' as the id variable on its own
  long <- melt(wide)
  ggplot(long, aes(x = variable, y = value, group = index, color = index)) +
    geom_line() +
    ggtitle(title) +
    labs(y = ylabel, x = xlabel)
}

Education

  • Formal Education (Q4)
  • DS Course Platforms (Q37)
  • Analysis Sharing Platforms (Q36)
  • DS Media Sources (Q39)
## Salary vs formal education (Q4)

# Labels shared by both charts of this section.
title <- 'Education Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'

cross_tab_pct <- contingency_table_pct(ques_num_1 = 'Q4', ques_num_2 = 'Q24')

# Shorten the two longest answer labels so the legend stays readable.
row.names(cross_tab_pct) <- replace(
  row.names(cross_tab_pct),
  row.names(cross_tab_pct) == 'Some college/university study without earning a bachelor’s degree',
  'College experience'
)
row.names(cross_tab_pct) <- replace(
  row.names(cross_tab_pct),
  row.names(cross_tab_pct) == 'No formal education past high school',
  'High School'
)

ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

While a Master’s degree is common across all income brackets, the share of people with only a bachelor’s degree declines from the low to the high income groups. The share of doctorates rises steadily from the low to the high income groups.

## Salary vs DS Course Platform (Q37)

# Labels shared by both charts of this section.
title <- 'DS Course Platform Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'

cross_tab_pct <- contingency_table_pct(ques_num_1 = 'Q37', ques_num_2 = 'Q24')

# Shorten the two longest answer labels so the legend stays readable.
row.names(cross_tab_pct) <- replace(
  row.names(cross_tab_pct),
  row.names(cross_tab_pct) == 'Cloud-certification programs (direct from AWS, Azure, GCP, or similar)',
  'Cloud-certification programs'
)
row.names(cross_tab_pct) <- replace(
  row.names(cross_tab_pct),
  row.names(cross_tab_pct) == 'University Courses (resulting in a university degree)',
  'University Courses'
)

ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

Popular course platforms (Coursera, edX, Fast.ai) that provide quality content in complex areas like deep learning see a steady rise from low to high income groups, while platforms with starter courses, like Kaggle Learn and Udemy, are more popular among the low income groups.

## Salary vs Analysis Sharing Platforms (Q36)

# Labels shared by both charts of this section.
title <- 'Analysis Sharing Platforms Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'

cross_tab_pct <- contingency_table_pct(ques_num_1 = 'Q36', ques_num_2 = 'Q24')

ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

GitHub remains the most popular choice for sharing analysis across salary brackets. However, as income increases, activity on public platforms (Kaggle, Colab, etc.) goes down.

## Salary vs DS Media Sources (Q39)

# Labels shared by both charts of this section.
title <- 'DS Media Sources Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'

cross_tab_pct <- contingency_table_pct(ques_num_1 = 'Q39', ques_num_2 = 'Q24')

ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

The high income bracket relies on Data Science experts to keep up with industry trends. They do this by reading journal publications, following people on Twitter, and signing up for email newsletters, podcasts and blogs. They avoid community platforms like YouTube, Slack communities or course forums.

Education | Analysis Recommendations

Where do you work?

## Salary vs Country of Residence (Q3)

# Countries compared in this section. 'Japan' previously carried a leading
# space (' Japan'), which keeps it from matching an unpadded label.
countries <- c('India', 'United Kingdom of Great Britain and Northern Ireland',
              'United States of America', 'Brazil', 'Japan', 'Russia', 'Other',
              'Nigeria', 'China', 'Germany')

# trimws() makes the match independent of stray padding around the label.
country_data <- dat[trimws(dat$Q3) %in% countries,]
cross_tab <- table(country_data[,'Q3'],country_data[,'Q24'])
# table() sorts the brackets alphabetically (high, low, medium, very high,
# very low); put them in ascending salary order, as contingency_table_pct
# does for every other section, so the x-axis reads low to high.
cross_tab <- cross_tab[,c('very low', 'low', 'medium', 'high', 'very high')]
cross_tab_pct <- apply(cross_tab,2,perc_share)

# Short display names for the two longest country labels.
row.names(cross_tab_pct)[row.names(cross_tab_pct) == 'United Kingdom of Great Britain and Northern Ireland'] <- 'UK'
row.names(cross_tab_pct)[row.names(cross_tab_pct) == 'United States of America'] <- 'USA'

title <- 'Country of Residence Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(cross_tab_pct,title, xlabel, ylabel)
## Using index as id variables

line_charts(cross_tab_pct,title, xlabel, ylabel)
## Using index as id variables

The USA is the place to be for Data Science professionals, followed by developed countries like the UK and Germany. Developing countries like India, China, etc. have fewer high-paying roles.

## Salary vs Company Size (Q20)

# Labels shared by both charts of this section.
title <- 'Company Size Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'

cross_tab_pct <- contingency_table_pct(ques_num_1 = 'Q20', ques_num_2 = 'Q24')

ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

Large companies with 1000 or more personnel are the best places to work for high income.

## Salary vs DS Team Size (Q21)

# Labels shared by both charts of this section.
title <- 'DS Team Size Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'

cross_tab_pct <- contingency_table_pct(ques_num_1 = 'Q21', ques_num_2 = 'Q24')

ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

Large Data Science teams (>15) are the go-to places for high incomes.

## Salary vs ML in Production (Q22)

# Labels shared by both charts of this section.
title <- 'ML in Production Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'

cross_tab_pct <- contingency_table_pct(ques_num_1 = 'Q22', ques_num_2 = 'Q24')

ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

Places using ML in production pay more than the ones not using it.

Where do you work? | Analysis Recommendations

Job Role & Experience

## Salary vs current role (Q5)

# Labels shared by both charts of this section.
title <- 'current role Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'

cross_tab_pct <- contingency_table_pct(ques_num_1 = 'Q5', ques_num_2 = 'Q24')

ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

Data Scientist and Product Manager are the highest paying roles.

## Salary vs coding experience (Q6)

# Labels shared by both charts of this section.
title <- 'coding experience Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'

cross_tab_pct <- contingency_table_pct(ques_num_1 = 'Q6', ques_num_2 = 'Q24')

ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

Coding experience pays. More experience translates to more pay.

## Salary vs Machine Learning Experience (Q15)

# Labels shared by both charts of this section.
title <- 'Machine Learning Experience Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'

cross_tab_pct <- contingency_table_pct(ques_num_1 = 'Q15', ques_num_2 = 'Q24')

ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

ML Experience pays. More experience translates to more money.

## Salary vs Work Responsibilities (Q23)

# Labels shared by both charts of this section.
title <- 'Work Responsibilities Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'

cross_tab_pct <- contingency_table_pct(ques_num_1 = 'Q23', ques_num_2 = 'Q24')

ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables

Using machine learning to generate value for companies by applying it in novel areas or improving existing systems pays the most.

Job Role & Experience | Analysis Recommendations